1   package org.apache.lucene.util;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.io.IOException;
21  import java.util.Arrays;
22  
23  import org.apache.lucene.search.DocIdSetIterator;
24  
25  /**
26   * A bit set that only stores longs that have at least one bit which is set.
27   * The way it works is that the space of bits is divided into blocks of
28   * 4096 bits, which is 64 longs. Then for each block, we have:<ul>
29   * <li>a long[] which stores the non-zero longs for that block</li>
30   * <li>a long so that bit <tt>i</tt> being set means that the <code>i-th</code>
31   *     long of the block is non-null, and its offset in the array of longs is
32   *     the number of one bits on the right of the <code>i-th</code> bit.</li></ul>
33   *
34   * @lucene.internal
35   */
36  public class SparseFixedBitSet extends BitSet implements Bits, Accountable {
37  
38    private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(SparseFixedBitSet.class);
39    private static final long SINGLE_ELEMENT_ARRAY_BYTES_USED = RamUsageEstimator.sizeOf(new long[1]);
40    private static final int MASK_4096 = (1 << 12) - 1;
41  
42    private static int blockCount(int length) {
43      int blockCount = length >>> 12;
44      if ((blockCount << 12) < length) {
45        ++blockCount;
46      }
47      assert (blockCount << 12) >= length;
48      return blockCount;
49    }
50  
51    final long[] indices;
52    final long[][] bits;
53    final int length;
54    int nonZeroLongCount;
55    long ramBytesUsed;
56  
57    /** Create a {@link SparseFixedBitSet} that can contain bits between
58     *  <code>0</code> included and <code>length</code> excluded. */
59    public SparseFixedBitSet(int length) {
60      if (length < 1) {
61        throw new IllegalArgumentException("length needs to be >= 1");
62      }
63      this.length = length;
64      final int blockCount = blockCount(length);
65      indices = new long[blockCount];
66      bits = new long[blockCount][];
67      ramBytesUsed = BASE_RAM_BYTES_USED
68          + RamUsageEstimator.shallowSizeOf(indices)
69          + RamUsageEstimator.shallowSizeOf(bits);
70    }
71  
72    @Override
73    public int length() {
74      return length;
75    }
76  
77    private boolean consistent(int index) {
78      assert index >= 0 && index < length : "index=" + index + ",length=" + length;
79      return true;
80    }
81  
82    @Override
83    public int cardinality() {
84      int cardinality = 0;
85      for (long[] bitArray : bits) {
86        if (bitArray != null) {
87          for (long bits : bitArray) {
88            cardinality += Long.bitCount(bits);
89          }
90        }
91      }
92      return cardinality;
93    }
94  
95    @Override
96    public int approximateCardinality() {
97      // we are assuming that bits are uniformly set and use the linear counting
98      // algorithm to estimate the number of bits that are set based on the number
99      // of longs that are different from zero
100     final int totalLongs = (length + 63) >>> 6; // total number of longs in the space
101     assert totalLongs >= nonZeroLongCount;
102     final int zeroLongs = totalLongs - nonZeroLongCount; // number of longs that are zeros
103     // No need to guard against division by zero, it will return +Infinity and things will work as expected
104     final long estimate = Math.round(totalLongs * Math.log((double) totalLongs / zeroLongs));
105     return (int) Math.min(length, estimate);
106   }
107 
108   @Override
109   public boolean get(int i) {
110     assert consistent(i);
111     final int i4096 = i >>> 12;
112     final long index = indices[i4096];
113     final int i64 = i >>> 6;
114     // first check the index, if the i64-th bit is not set, then i is not set
115     // note: this relies on the fact that shifts are mod 64 in java
116     if ((index & (1L << i64)) == 0) {
117       return false;
118     }
119 
120     // if it is set, then we count the number of bits that are set on the right
121     // of i64, and that gives us the index of the long that stores the bits we
122     // are interested in
123     final long bits = this.bits[i4096][Long.bitCount(index & ((1L << i64) - 1))];
124     return (bits & (1L << i)) != 0;
125   }
126 
127   private static int oversize(int s) {
128     int newSize = s + (s >>> 1);
129     if (newSize > 50) {
130       newSize = 64;
131     }
132     return newSize;
133   }
134 
135   /**
136    * Set the bit at index <tt>i</tt>.
137    */
138   public void set(int i) {
139     assert consistent(i);
140     final int i4096 = i >>> 12;
141     final long index = indices[i4096];
142     final int i64 = i >>> 6;
143     if ((index & (1L << i64)) != 0) {
144       // in that case the sub 64-bits block we are interested in already exists,
145       // we just need to set a bit in an existing long: the number of ones on
146       // the right of i64 gives us the index of the long we need to update
147       bits[i4096][Long.bitCount(index & ((1L << i64) - 1))] |= 1L << i; // shifts are mod 64 in java
148     } else if (index == 0) {
149       // if the index is 0, it means that we just found a block of 4096 bits
150       // that has no bit that is set yet. So let's initialize a new block:
151       insertBlock(i4096, i64, i);
152     } else {
153       // in that case we found a block of 4096 bits that has some values, but
154       // the sub-block of 64 bits that we are interested in has no value yet,
155       // so we need to insert a new long
156       insertLong(i4096, i64, i, index);
157     }
158   }
159 
160   private void insertBlock(int i4096, int i64, int i) {
161     indices[i4096] = 1L << i64; // shifts are mod 64 in java
162     assert bits[i4096] == null;
163     bits[i4096] = new long[] { 1L << i }; // shifts are mod 64 in java
164     ++nonZeroLongCount;
165     ramBytesUsed += SINGLE_ELEMENT_ARRAY_BYTES_USED;
166   }
167 
168   private void insertLong(int i4096, int i64, int i, long index) {
169     indices[i4096] |= 1L << i64; // shifts are mod 64 in java
170     // we count the number of bits that are set on the right of i64
171     // this gives us the index at which to perform the insertion
172     final int o = Long.bitCount(index & ((1L << i64) - 1));
173     final long[] bitArray = bits[i4096];
174     if (bitArray[bitArray.length - 1] == 0) {
175       // since we only store non-zero longs, if the last value is 0, it means
176       // that we alreay have extra space, make use of it
177       System.arraycopy(bitArray, o, bitArray, o + 1, bitArray.length - o - 1);
178       bitArray[o] = 1L << i;
179     } else {
180       // we don't have extra space so we need to resize to insert the new long
181       final int newSize = oversize(bitArray.length + 1);
182       final long[] newBitArray = new long[newSize];
183       System.arraycopy(bitArray, 0, newBitArray, 0, o);
184       newBitArray[o] = 1L << i;
185       System.arraycopy(bitArray, o, newBitArray, o + 1, bitArray.length - o);
186       bits[i4096] = newBitArray;
187       ramBytesUsed += RamUsageEstimator.sizeOf(newBitArray) - RamUsageEstimator.sizeOf(bitArray);
188     }
189     ++nonZeroLongCount;
190   }
191 
192   /**
193    * Clear the bit at index <tt>i</tt>.
194    */
195   public void clear(int i) {
196     assert consistent(i);
197     final int i4096 = i >>> 12;
198     final int i64 = i >>> 6;
199     and(i4096, i64, ~(1L << i));
200   }
201 
202   private void and(int i4096, int i64, long mask) {
203     final long index = indices[i4096];
204     if ((index & (1L << i64)) != 0) {
205       // offset of the long bits we are interested in in the array
206       final int o = Long.bitCount(index & ((1L << i64) - 1));
207       long bits = this.bits[i4096][o] & mask;
208       if (bits == 0) {
209         removeLong(i4096, i64, index, o);
210       } else {
211         this.bits[i4096][o] = bits;
212       }
213     }
214   }
215 
216   private void removeLong(int i4096, int i64, long index, int o) {
217     index &= ~(1L << i64);
218     indices[i4096] = index;
219     if (index == 0) {
220       // release memory, there is nothing in this block anymore
221       this.bits[i4096] = null;
222     } else {
223       final int length = Long.bitCount(index);
224       final long[] bitArray = bits[i4096];
225       System.arraycopy(bitArray, o + 1, bitArray, o, length - o);
226       bitArray[length] = 0L;
227     }
228     nonZeroLongCount -= 1;
229   }
230 
231   @Override
232   public void clear(int from, int to) {
233     assert from >= 0;
234     assert to <= length;
235     if (from >= to) {
236       return;
237     }
238     final int firstBlock = from >>> 12;
239     final int lastBlock = (to - 1) >>> 12;
240     if (firstBlock == lastBlock) {
241       clearWithinBlock(firstBlock, from & MASK_4096, (to - 1) & MASK_4096);
242     } else {
243       clearWithinBlock(firstBlock, from & MASK_4096, MASK_4096);
244       for (int i = firstBlock + 1; i < lastBlock; ++i) {
245         nonZeroLongCount -= Long.bitCount(indices[i]);
246         indices[i] = 0;
247         bits[i] = null;
248       }
249       clearWithinBlock(lastBlock, 0, (to - 1) & MASK_4096);
250     }
251   }
252 
253   // create a long that has bits set to one between from and to
254   private static long mask(int from, int to) {
255     return ((1L << (to - from) << 1) - 1) << from;
256   }
257 
258   private void clearWithinBlock(int i4096, int from, int to) {
259     int firstLong = from >>> 6;
260     int lastLong = to >>> 6;
261 
262     if (firstLong == lastLong) {
263       and(i4096, firstLong, ~mask(from, to));
264     } else {
265       assert firstLong < lastLong;
266       and(i4096, lastLong, ~mask(0, to));
267       for (int i = lastLong - 1; i >= firstLong + 1; --i) {
268         and(i4096, i, 0L);
269       }
270       and(i4096, firstLong, ~mask(from, 63));
271     }
272   }
273 
274   /** Return the first document that occurs on or after the provided block index. */
275   private int firstDoc(int i4096) {
276     long index = 0;
277     while (i4096 < indices.length) {
278       index = indices[i4096];
279       if (index != 0) {
280         final int i64 = Long.numberOfTrailingZeros(index);
281         return (i4096 << 12) | (i64 << 6) | Long.numberOfTrailingZeros(bits[i4096][0]);
282       }
283       i4096 += 1;
284     }
285     return DocIdSetIterator.NO_MORE_DOCS;
286   }
287 
288   @Override
289   public int nextSetBit(int i) {
290     assert i < length;
291     final int i4096 = i >>> 12;
292     final long index = indices[i4096];
293     final long[] bitArray = this.bits[i4096];
294     int i64 = i >>> 6;
295     int o = Long.bitCount(index & ((1L << i64) - 1));
296     if ((index & (1L << i64)) != 0) {
297       // There is at least one bit that is set in the current long, check if
298       // one of them is after i
299       final long bits = bitArray[o] >>> i; // shifts are mod 64
300       if (bits != 0) {
301         return i + Long.numberOfTrailingZeros(bits);
302       }
303       o += 1;
304     }
305     final long indexBits = index >>> i64 >>> 1;
306     if (indexBits == 0) {
307       // no more bits are set in the current block of 4096 bits, go to the next one
308       return firstDoc(i4096 + 1);
309     }
310     // there are still set bits
311     i64 += 1 + Long.numberOfTrailingZeros(indexBits);
312     final long bits = bitArray[o];
313     return (i64 << 6) | Long.numberOfTrailingZeros(bits);
314   }
315 
316   /** Return the last document that occurs on or before the provided block index. */
317   private int lastDoc(int i4096) {
318     long index;
319     while (i4096 >= 0) {
320       index = indices[i4096];
321       if (index != 0) {
322         final int i64 = 63 - Long.numberOfLeadingZeros(index);
323         final long bits = this.bits[i4096][Long.bitCount(index) - 1];
324         return (i4096 << 12) | (i64 << 6) | (63 - Long.numberOfLeadingZeros(bits));
325       }
326       i4096 -= 1;
327     }
328     return -1;
329   }
330 
331   @Override
332   public int prevSetBit(int i) {
333     assert i >= 0;
334     final int i4096 = i >>> 12;
335     final long index = indices[i4096];
336     final long[] bitArray = this.bits[i4096];
337     int i64 = i >>> 6;
338     final long indexBits = index & ((1L << i64) - 1);
339     final int o = Long.bitCount(indexBits);
340     if ((index & (1L << i64)) != 0) {
341       // There is at least one bit that is set in the same long, check if there
342       // is one bit that is set that is lower than i
343       final long bits = bitArray[o] & ((1L << i << 1) - 1);
344       if (bits != 0) {
345         return (i64 << 6) | (63 - Long.numberOfLeadingZeros(bits));
346       }
347     }
348     if (indexBits == 0) {
349       // no more bits are set in this block, go find the last bit in the
350       // previous block
351       return lastDoc(i4096 - 1);
352     }
353     // go to the previous long
354     i64 = 63 - Long.numberOfLeadingZeros(indexBits);
355     final long bits = bitArray[o - 1];
356     return (i4096 << 12) | (i64 << 6) | (63 - Long.numberOfLeadingZeros(bits));
357   }
358 
359   /** Return the long bits at the given <code>i64</code> index. */
360   private long longBits(long index, long[] bits, int i64) {
361     if ((index & (1L << i64)) == 0) {
362       return 0L;
363     } else {
364       return bits[Long.bitCount(index & ((1L << i64) - 1))];
365     }
366   }
367 
368   private void or(final int i4096, final long index, long[] bits, int nonZeroLongCount) {
369     assert Long.bitCount(index) == nonZeroLongCount;
370     final long currentIndex = indices[i4096];
371     if (currentIndex == 0) {
372       // fast path: if we currently have nothing in the block, just copy the data
373       // this especially happens all the time if you call OR on an empty set
374       indices[i4096] = index;
375       this.bits[i4096] = Arrays.copyOf(bits, nonZeroLongCount);
376       this.nonZeroLongCount += nonZeroLongCount;
377       return;
378     }
379     final long[] currentBits = this.bits[i4096];
380     final long[] newBits;
381     final long newIndex = currentIndex | index;
382     final int requiredCapacity = Long.bitCount(newIndex);
383     if (currentBits.length >= requiredCapacity) {
384       newBits = currentBits;
385     } else {
386       newBits = new long[oversize(requiredCapacity)];
387     }
388     // we iterate backwards in order to not override data we might need on the next iteration if the
389     // array is reused
390     for (int i = Long.numberOfLeadingZeros(newIndex), newO = Long.bitCount(newIndex) - 1;
391         i < 64;
392         i += 1 + Long.numberOfLeadingZeros(newIndex << (i + 1)), newO -= 1) {
393       // bitIndex is the index of a bit which is set in newIndex and newO is the number of 1 bits on its right
394       final int bitIndex = 63 - i;
395       assert newO == Long.bitCount(newIndex & ((1L << bitIndex) - 1));
396       newBits[newO] = longBits(currentIndex, currentBits, bitIndex) | longBits(index, bits, bitIndex);
397     }
398     indices[i4096] = newIndex;
399     this.bits[i4096] = newBits;
400     this.nonZeroLongCount += nonZeroLongCount - Long.bitCount(currentIndex & index);
401   }
402 
403   private void or(SparseFixedBitSet other) {
404     for (int i = 0; i < other.indices.length; ++i) {
405       final long index = other.indices[i];
406       if (index != 0) {
407         or(i, index, other.bits[i], Long.bitCount(index));
408       }
409     }
410   }
411 
412   /**
413    * {@link #or(DocIdSetIterator)} impl that works best when <code>it</code> is dense
414    */
415   private void orDense(DocIdSetIterator it) throws IOException {
416     assertUnpositioned(it);
417     // The goal here is to try to take advantage of the ordering of documents
418     // to build the data-structure more efficiently
419     // NOTE: this heavily relies on the fact that shifts are mod 64
420     final int firstDoc = it.nextDoc();
421     if (firstDoc == DocIdSetIterator.NO_MORE_DOCS) {
422       return;
423     }
424     int i4096 = firstDoc >>> 12;
425     int i64 = firstDoc >>> 6;
426     long index = 1L << i64;
427     long currentLong = 1L << firstDoc;
428     // we store at most 64 longs per block so preallocate in order never to have to resize
429     long[] longs = new long[64];
430     int numLongs = 0;
431 
432     for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
433       final int doc64 = doc >>> 6;
434       if (doc64 == i64) {
435         // still in the same long, just set the bit
436         currentLong |= 1L << doc;
437       } else {
438         longs[numLongs++] = currentLong;
439 
440         final int doc4096 = doc >>> 12;
441         if (doc4096 == i4096) {
442           index |= 1L << doc64;
443         } else {
444           // we are on a new block, flush what we buffered
445           or(i4096, index, longs, numLongs);
446           // and reset state for the new block
447           i4096 = doc4096;
448           index = 1L << doc64;
449           numLongs = 0;
450         }
451 
452         // we are on a new long, reset state
453         i64 = doc64;
454         currentLong = 1L << doc;
455       }
456     }
457 
458     // flush
459     longs[numLongs++] = currentLong;
460     or(i4096, index, longs, numLongs);
461   }
462 
463   @Override
464   public void or(DocIdSetIterator it) throws IOException {
465     {
466       // specialize union with another SparseFixedBitSet
467       final SparseFixedBitSet other = BitSetIterator.getSparseFixedBitSetOrNull(it);
468       if (other != null) {
469         assertUnpositioned(it);
470         or(other);
471         return;
472       }
473     }
474 
475     // We do not specialize the union with a FixedBitSet since FixedBitSets are
476     // supposed to be used for dense data and sparse fixed bit sets for sparse
477     // data, so a sparse set would likely get upgraded by DocIdSetBuilder before
478     // being or'ed with a FixedBitSet
479 
480     if (it.cost() < indices.length) {
481       // the default impl is good for sparse iterators
482       super.or(it);
483     } else {
484       orDense(it);
485     }
486   }
487 
488   // AND and AND_NOT do not need much specialization here since this sparse set
489   // is supposed to be used on sparse data and the default AND/AND_NOT impl
490   // (leap frog) is efficient when at least one of the sets contains sparse data
491 
492   @Override
493   public void and(DocIdSetIterator it) throws IOException {
494     final SparseFixedBitSet other = BitSetIterator.getSparseFixedBitSetOrNull(it);
495     if (other != null) {
496       // if we are merging with another SparseFixedBitSet, a quick win is
497       // to clear up some blocks by only looking at their index. Then the set
498       // is sparser and the leap-frog approach of the parent class is more
499       // efficient. Since SparseFixedBitSet is supposed to be used for sparse
500       // sets, the intersection of two SparseFixedBitSet is likely very sparse
501       final int numCommonBlocks = Math.min(indices.length, other.indices.length);
502       for (int i = 0; i < numCommonBlocks; ++i) {
503         if ((indices[i] & other.indices[i]) == 0) {
504           this.nonZeroLongCount -= Long.bitCount(this.indices[i]);
505           this.indices[i] = 0;
506           this.bits[i] = null;
507         }
508       }
509     }
510     super.and(it);
511   }
512 
513   @Override
514   public long ramBytesUsed() {
515     return ramBytesUsed;
516   }
517 
518   @Override
519   public String toString() {
520     return "SparseFixedBitSet(size=" + length + ",cardinality=~" + approximateCardinality();
521   }
522 }